# Project: Serbia Survey
# Authors: William O'Brochta and Patrick Cunha Silva
# Goal: STM Analysis on FB posts (President Aleksandar Vucuc's Pages)

rm(list = ls())

# Load Raw Data (note raw data are not provided because raw data are from Meta)
load(file = here('fb_data.RData'))

###############################################################
################## STM: Aleksandar Vucic ######################
###############################################################

# Aleksandar Vucic has two pages, one is mainly in Latin and another is in Cyrillic
# https://www.facebook.com/vucicaleksandar/
# https://www.facebook.com/buducnostsrbijeavucic/

# These pages only overlap for three years
aleksandar <-  subset(fb_data, owner == 'AleksandarVucic' & year %in% c(2019, 2020, 2021))

# Repare corpus
mystopwords <- c('s', '#39', '<', '>', 'rs', 'www', 
                 '8d', 'iä', 'org', 'https', 'http', 'ä', 'dr', 't', 
                 stopwords('english'), stopwords('russian'))
doc_pros_alec <- textProcessor(aleksandar$translation, 
                               metadata = aleksandar[, c('inCyrillic', 'owner', 
                                                         'leg_election', 'prez_election', 'Facebook.Id', 'year', 'id', 'date')], 
                               language = 'english',
                               removestopwords = TRUE, 
                               removenumbers = TRUE, 
                               removepunctuation = TRUE,
                               lowercase = TRUE, 
                               stem = FALSE, 
                               striphtml = TRUE, 
                               customstopwords = mystopwords,
                               onlycharacter = TRUE)
# Plot Removed Words
plotRemoved(doc_pros_alec[[1]], lower.thresh = seq(from = 10, to = 1000, by = 10)) 

# Remove Words that appear in less than 1% of the documents or in more than 90%
stmObj_alec <- prepDocuments(doc_pros_alec[[1]], doc_pros_alec[[2]], meta=doc_pros_alec$meta
                             , lower.thresh = round(nrow(aleksandar) * 0.01), 
                             upper.thresh = round(nrow(aleksandar) * 0.90))
docs_alec <- stmObj_alec$documents
vocab_alec <- stmObj_alec$vocab
meta_alec <- stmObj_alec$meta

####################################################
############ SI B.2 - Number of Topics #############
####################################################

#--------------- Uncomment Lines 259-264 to run the model again -------------#
# Search across number of topics
# storage_alec <- searchK(documents = docs_alec,
#     vocab = vocab_alec,
#     K = seq(5, 60, 5),
#     prevalence = ~ inCyrillic + leg_election + year,
#     data = meta_alec,
#     # cores = 8
#     )
# Save storage
# save(storage_alec, file = here("storage_alec.RData"))

# Load storage
load(file = here("storage_alec.RData"))

# We need to identify the number of topics. We use the euclidean distance
# Note that Exclusivity and Semantic Coherence are in different scales
# Thus, we first scale them.
y = (unlist(storage_alec$results$exclus))
x = (unlist(storage_alec$results$semcoh))
y = scale(unlist(storage_alec$results$exclus))
x = scale(unlist(storage_alec$results$semcoh))
eucledian <- sqrt((y - max(y))^2 + (x - max(x))^2)
round(cbind(unlist(storage_alec$results$K), y, x, eucledian), 3)

# Find number of topics
c(unlist(storage_alec$results$K))[which.min(sqrt((y - max(y))^2 + (x - max(x))^2))]

# Find ideal number of topics
par(mar = c(6, 6, 2, 2))
plot(y = unlist(storage_alec$results$exclus),
     x = unlist(storage_alec$results$semcoh),
     ylab = 'Exclusivity',
     xlab = 'Semantic Coherence',
     pch = 1:length(seq(5, 60, 5)),
     cex.lab = 1.75,
     cex.axis = 1.5)
legend('bottomleft', legend = c(unlist(storage_alec$results$K)),
       pch = 1:length(seq(5, 60, 5)))
par(mar = c(6, 6, 2, 2))
plot(y =y,
     x = x,
     ylab = 'Exclusivity',
     xlab = 'Semantic Coherence',
     pch = 1:length(seq(5, 60, 5)),
     cex.lab = 1.75,
     cex.axis = 1.5)
legend('bottomleft', legend = c(unlist(storage_alec$results$K)),
       pch = 1:length(seq(5, 60, 5)))

# K = 20 seems to be the best option, given that it is a good balance between 
# Semantic Coherence, Exclusivity, and Residual

######################################
############ Main Model ##############
######################################

#--------------- Uncomment Lines Below to run the model again -------------#
# Run STM
# matDoc_alec <- stm(documents = docs_alec,
#     vocab = vocab_alec,
#     K = 20,
#     data = meta_alec,
#     prevalence = ~ inCyrillic + leg_election + year,
#     init.type = 'Spectral'
# )

# Save model
# save(matDoc_alec, file = here('outModel_alec.RData'))

# Load model 
load(file = here('outModel_alec.RData'))

# Label these topics
labelTopics(matDoc_alec)
# Highest Prob: are the words within each topic with the highest probability 
# FREX: are the words that are both frequent and exclusive

# Highly associated documents
findThoughts(matDoc_alec, 
             texts = aleksandar$translation[aleksandar$id %in% meta_alec$id], n = 2)


#####Topic Names
# Topic 1: President
# Topic 2: Belgrade
# Topic 3: Kosovo
# Topic 4: Infrastructure
# Topic 5: China
# Topic 6: Military
# Topic 7: Serbian Heritage
# Topic 8: Standard of Living
# Topic 9: Covid 19
# Topic 10: Peace, No Conflict
# Topic 11: Russia
# Topic 12: Institutional Visits
# Topic 13: International Cooperation
# Topic 14: Public Policy
# Topic 15: Development/Investments
# Topic 16: Economy
# Topic 17: Presidential meetings
# Topic 18: Foreign Affairs
# Topic 19: Public Festivities and Announcements
# Topic 20: Economy (International)


# Estimated Effect of Cyrillic
meta_alec$year <- as.character(meta_alec$year)
out_alec <- estimateEffect(formula = 1:20 ~ inCyrillic + leg_election + year,  
                           metadata = meta_alec, stmobj = matDoc_alec, uncertainty = 'Global', prior = 1e-5)

# Plot the results
par(mar = c(5, 17, 1, 1))
plot(out_alec, covariate = 'inCyrillic', 
     cov.value1 = 1, cov.value2 = 0, 
     method = 'difference', 
     labeltype = 'custom', 
     ci.level = 0.95,
     axes = FALSE,
     xlim = c(-0.25, 0.10),
     custom.labels = '',
     xlab = 'Difference in Topic Proportion',
     cex.lab = 1.5
)
mylabels <- c (
  'President (himself)'
  , 'Belgrade'
  , "Kosovo"
  , "Infrastructure"
  , "China"
  , "Military"
  , "Serbian Heritage"
  , "Standard of Living"
  , "Covid-19"
  , "Peace, No Conflict"
  , "Russia"
  , "Institutional Visits"
  , "International Cooperation"
  , "Public Policy"
  , "Development/Investments"
  , "Economy"
  , "Presidential Meetings"
  , "Foreign Affairs"
  , "Festivities and Announcements"
  , "Economy (International)"
)
axis(2, at = 1:20, labels = rev(mylabels), las = 2, cex.axis = 1.25)
axis(1, at = seq(-0.20, 0.10, by = 0.05), labels = c(-0.2, '', 
                                                     -0.1, '', 0, '', 0.1), cex.axis = 1.25)

###################################################
############ SI B.6 - Complete Results ############
###################################################

# Create tables, we will do 4 tables, each will have 5 topics.
toTable <- matrix(NA, ncol = 5, nrow = 8) # Topics 1-5
toTable[seq(1, 6, 2), 1] <- round(unname(summary(out_alec)[['tables']][[1]][1:3, 'Estimate']), 3) # Betas 1
toTable[seq(1, 6, 2), 2] <- round(unname(summary(out_alec)[['tables']][[2]][1:3, 'Estimate']), 3) # Betas 2
toTable[seq(1, 6, 2), 3] <- round(unname(summary(out_alec)[['tables']][[3]][1:3, 'Estimate']), 3) # Betas 3
toTable[seq(1, 6, 2), 4] <- round(unname(summary(out_alec)[['tables']][[4]][1:3, 'Estimate']), 3) # Betas 4
toTable[seq(1, 6, 2), 5] <- round(unname(summary(out_alec)[['tables']][[5]][1:3, 'Estimate']), 3) # Betas 5
toTable[seq(2, 6, 2), 1] <- paste0('(', round(unname(summary(out_alec)[['tables']][[1]][1:3, 'Std. Error']), 3), ')') # SE 1
toTable[seq(2, 6, 2), 2] <- paste0('(', round(unname(summary(out_alec)[['tables']][[2]][1:3, 'Std. Error']), 3), ')') # SE 2
toTable[seq(2, 6, 2), 3] <- paste0('(', round(unname(summary(out_alec)[['tables']][[3]][1:3, 'Std. Error']), 3), ')') # SE 4
toTable[seq(2, 6, 2), 4] <- paste0('(', round(unname(summary(out_alec)[['tables']][[4]][1:3, 'Std. Error']), 3), ')') # SE 4
toTable[seq(2, 6, 2), 5] <- paste0('(', round(unname(summary(out_alec)[['tables']][[5]][1:3, 'Std. Error']), 3), ')') # SE 5
toTable[7, ] <- 'Yes'
toTable[8, ] <- nrow(meta_alec)
colnames(toTable) <- paste("Topic", 1:5)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '',
                       'FE by Year', 'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')

toTable <- matrix(NA, ncol = 5, nrow = 8) # Topics 6-10
toTable[seq(1, 6, 2), 1] <- round(unname(summary(out_alec)[['tables']][[6]][1:3, 'Estimate']), 3) # Betas 6
toTable[seq(1, 6, 2), 2] <- round(unname(summary(out_alec)[['tables']][[7]][1:3, 'Estimate']), 3) # Betas 7
toTable[seq(1, 6, 2), 3] <- round(unname(summary(out_alec)[['tables']][[8]][1:3, 'Estimate']), 3) # Betas 8
toTable[seq(1, 6, 2), 4] <- round(unname(summary(out_alec)[['tables']][[9]][1:3, 'Estimate']), 3) # Betas 9
toTable[seq(1, 6, 2), 5] <- round(unname(summary(out_alec)[['tables']][[10]][1:3, 'Estimate']), 3) # Betas 10
toTable[seq(2, 6, 2), 1] <- paste0('(', round(unname(summary(out_alec)[['tables']][[6]][1:3, 'Std. Error']), 3), ')') # SE 6
toTable[seq(2, 6, 2), 2] <- paste0('(', round(unname(summary(out_alec)[['tables']][[7]][1:3, 'Std. Error']), 3), ')') # SE 7
toTable[seq(2, 6, 2), 3] <- paste0('(', round(unname(summary(out_alec)[['tables']][[8]][1:3, 'Std. Error']), 3), ')') # SE 8
toTable[seq(2, 6, 2), 4] <- paste0('(', round(unname(summary(out_alec)[['tables']][[9]][1:3, 'Std. Error']), 3), ')') # SE 9
toTable[seq(2, 6, 2), 5] <- paste0('(', round(unname(summary(out_alec)[['tables']][[10]][1:3, 'Std. Error']), 3), ')') # SE 10
toTable[7, ] <- 'Yes'
toTable[8, ] <- nrow(meta_alec)
colnames(toTable) <- paste("Topic", 6:10)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '',
                       'FE by Year', 'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')

toTable <- matrix(NA, ncol = 5, nrow = 8) # Topics 11-15
toTable[seq(1, 6, 2), 1] <- round(unname(summary(out_alec)[['tables']][[11]][1:3, 'Estimate']), 3) # Betas 11
toTable[seq(1, 6, 2), 2] <- round(unname(summary(out_alec)[['tables']][[12]][1:3, 'Estimate']), 3) # Betas 12
toTable[seq(1, 6, 2), 3] <- round(unname(summary(out_alec)[['tables']][[13]][1:3, 'Estimate']), 3) # Betas 13
toTable[seq(1, 6, 2), 4] <- round(unname(summary(out_alec)[['tables']][[14]][1:3, 'Estimate']), 3) # Betas 14
toTable[seq(1, 6, 2), 5] <- round(unname(summary(out_alec)[['tables']][[15]][1:3, 'Estimate']), 3) # Betas 15
toTable[seq(2, 6, 2), 1] <- paste0('(', round(unname(summary(out_alec)[['tables']][[11]][1:3, 'Std. Error']), 3), ')') # SE 11
toTable[seq(2, 6, 2), 2] <- paste0('(', round(unname(summary(out_alec)[['tables']][[12]][1:3, 'Std. Error']), 3), ')') # SE 12
toTable[seq(2, 6, 2), 3] <- paste0('(', round(unname(summary(out_alec)[['tables']][[13]][1:3, 'Std. Error']), 3), ')') # SE 13
toTable[seq(2, 6, 2), 4] <- paste0('(', round(unname(summary(out_alec)[['tables']][[14]][1:3, 'Std. Error']), 3), ')') # SE 14
toTable[seq(2, 6, 2), 5] <- paste0('(', round(unname(summary(out_alec)[['tables']][[15]][1:3, 'Std. Error']), 3), ')') # SE 15
toTable[7, ] <- 'Yes'
toTable[8, ] <- nrow(meta_alec)
colnames(toTable) <- paste("Topic", 11:15)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '', 
                       'FE by Year', 'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')

toTable <- matrix(NA, ncol = 5, nrow = 8) # Topics 16-20
toTable[seq(1, 6, 2), 1] <- round(unname(summary(out_alec)[['tables']][[16]][1:3, 'Estimate']), 3) # Betas 16
toTable[seq(1, 6, 2), 2] <- round(unname(summary(out_alec)[['tables']][[17]][1:3, 'Estimate']), 3) # Betas 17
toTable[seq(1, 6, 2), 3] <- round(unname(summary(out_alec)[['tables']][[18]][1:3, 'Estimate']), 3) # Betas 18
toTable[seq(1, 6, 2), 4] <- round(unname(summary(out_alec)[['tables']][[19]][1:3, 'Estimate']), 3) # Betas 19
toTable[seq(1, 6, 2), 5] <- round(unname(summary(out_alec)[['tables']][[20]][1:3, 'Estimate']), 3) # Betas 20
toTable[seq(2, 6, 2), 1] <- paste0('(', round(unname(summary(out_alec)[['tables']][[16]][1:3, 'Std. Error']), 3), ')') # SE 16
toTable[seq(2, 6, 2), 2] <- paste0('(', round(unname(summary(out_alec)[['tables']][[17]][1:3, 'Std. Error']), 3), ')') # SE 17
toTable[seq(2, 6, 2), 3] <- paste0('(', round(unname(summary(out_alec)[['tables']][[18]][1:3, 'Std. Error']), 3), ')') # SE 18
toTable[seq(2, 6, 2), 4] <- paste0('(', round(unname(summary(out_alec)[['tables']][[19]][1:3, 'Std. Error']), 3), ')') # SE 19
toTable[seq(2, 6, 2), 5] <- paste0('(', round(unname(summary(out_alec)[['tables']][[20]][1:3, 'Std. Error']), 3), ')') # SE 20
toTable[7, ] <- 'Yes'
toTable[8, ] <- nrow(meta_alec)
colnames(toTable) <- paste("Topic", 16:20)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '',
                       'FE by Year', 'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')
